require "open-uri"
require "set"
require "uri"
require "iconv"

# Step applied to the `start` query parameter between successive page requests
# (presumably the listing's page size -- confirm against the site).
DOUBAN_PAGE_SIZE = 20

# Captures the numeric id of every movie subject link found in a page.
DOUBAN_SUBJECT_PATTERN = %r{movie\.douban\.com/subject/(\d+)}.freeze

# HTTP request headers sent with every page request.
#
# NOTE(review): the Cookie value is a hard-coded browser session captured in
# the source; it is reproduced unchanged, but it should be loaded from
# configuration (or dropped) rather than committed.
DOUBAN_REQUEST_HEADERS = {
  "User-Agent" => "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36",
  "Accept-Language" => 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2,de;q=0.2,zh-TW;q=0.2,ko;q=0.2',
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Cookie" => %{__ozlvd1511=1379975502; bid="J9eoExKhsMc"; helperVersion=2014.02.13; viewed="1641034_1084165_3803820_5338024_20383810_1231910_21323941_3288908_1139426_3004255"; ll="118236"; ct=y; push_noty_num=0; push_doumail_num=0; __utma=30149280.664101251.1362700616.1397338805.1397343881.32; __utmb=30149280.53.10.1397343881; __utmc=30149280; __utmz=30149280.1397338805.31.16.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.5320; __utma=223695111.42487028.1385584263.1385584263.1397343881.2; __utmb=223695111.0.10.1397343881; __utmc=223695111; __utmz=223695111.1385584263.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=223695111.|1=Addon=CR%20Y2014.02.13%2040404=1; __utma=223695111.42487028.1385584263.1385584263.1397343881.2; __utmb=223695111.0.10.1397343881; __utmc=223695111; __utmz=223695111.1385584263.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=223695111.|1=Addon=CR%20Y2014.02.13%2040404=1},
  "Host" => 'movie.douban.com',
}.freeze

# Walks the listing pages of one movie tag and writes every subject id found.
#
# Pages are requested with `start=0, 20, 40, ...` until a page yields no
# subject ids. Ids are de-duplicated per page (not across pages) and written
# to +file+ one per line, in order of first appearance.
#
# A page that still fails after the retry budget is spent has its URL written
# to +log+, and the crawl of this tag stops; the download error itself is not
# raised to the caller. Errors raised while writing to +file+ or +log+ are not
# swallowed.
#
# @param category [#to_s] tag name; percent-encoded before being placed in the URL path
# @param file [#puts, #flush] receives one subject id per line
# @param log [#puts, #flush] receives the URL of any page that could not be downloaded
# @param max_retries [Integer] extra attempts allowed per page after the first failure
# @param delay [Numeric] seconds to pause between pages and between retries
# @param fetcher [#call, nil] callable taking a URL and returning the page body;
#   defaults to a real HTTP GET (mainly useful for substituting a stub)
# @return [nil]
def fetch(category, file, log, max_retries: 5, delay: 0.5, fetcher: nil)
  fetcher ||= method(:fetch_douban_page)
  # encode_www_form_component writes a space as "+", which is only valid in a
  # query string; a path segment needs "%20". Literal "+" is already "%2B".
  tag = URI.encode_www_form_component(category.to_s).gsub("+", "%20")
  base_url = "http://movie.douban.com/tag/#{tag}"
  offset = 0

  loop do
    url = "#{base_url}?#{URI.encode_www_form(start: offset, type: 'R')}"
    p url

    html = fetch_page_with_retries(url, fetcher, max_retries, delay)
    if html.nil?
      # Give up on the whole tag: moving on to the next offset while the server
      # keeps failing would never reach the empty page that ends the loop.
      log.puts url
      log.flush
      return
    end

    ids = extract_douban_subject_ids(html)
    return if ids.empty?

    ids.each { |id| file.puts id }
    file.flush

    sleep(delay) if delay > 0
    offset += DOUBAN_PAGE_SIZE
  end
end

# Downloads +url+ through +fetcher+, retrying a bounded number of times.
# Returns the page body, or nil once +max_retries+ extra attempts have failed.
def fetch_page_with_retries(url, fetcher, max_retries, delay)
  attempts = 0
  begin
    fetcher.call(url)
  rescue StandardError => err
    p err
    attempts += 1
    return nil if attempts > max_retries

    sleep(delay) if delay > 0
    retry
  end
end

# Performs the real HTTP GET for one page and returns the response body.
def fetch_douban_page(url)
  # open-uri gets its own copy of the headers so the frozen constant is never
  # handed to library code.
  URI.open(url, DOUBAN_REQUEST_HEADERS.dup, &:read)
end

# Returns the unique subject ids linked from +html+, in order of first appearance.
def extract_douban_subject_ids(html)
  # Treat the bytes as UTF-8 and drop any invalid sequences so scanning can
  # never raise on a malformed page.
  text = html.to_s.dup.force_encoding(Encoding::UTF_8).scrub("")
  text.scan(DOUBAN_SUBJECT_PATTERN).flatten.uniq
end


# Tag names to crawl, in crawl order; each entry is handed to `fetch` as its
# `category` argument by the loop at the bottom of the script.
categories = %w[
  美国 日本 香港 英国 中国
  法国 韩国 台湾 德国 意大利
  内地 泰国 西班牙 印度 欧洲
  加拿大 中国大陆 澳大利亚 伊朗 瑞典
  巴西 爱尔兰 波兰 捷克 丹麦
  阿根廷 比利时 墨西哥 奥地利 荷兰
  匈牙利 土耳其 新西兰 新加坡 以色列
]

# Crawl every category, writing subject ids to 'res' and problem URLs to 'log'.
# The block form of File.open closes both files even if a crawl raises part-way
# through; the inner handle ('log') is closed first, then 'res'.
File.open('res', 'w') do |file|
  File.open('log', 'w') do |log|
    categories.each do |category|
      p category
      fetch(category, file, log)
    end
  end
end
